import glob
import urllib2
import sys
import re
import datetime
import os
import hashlib
from traceback import format_exc as tracebackformat_exc
from HTMLParser import HTMLParser
from cgi import escape as cgiescape
import multiprocessing as mp
import time
from shutil import copyfile as shutilcopyfile

def wos_login(workdir):
    """Authenticate against the WOK web service and return the session id.

    The request body is read from ``download_auth.xml`` and the expected
    response pattern from ``auth_check.xml`` (both in the current working
    directory).  The pattern has its line breaks removed, is anchored with
    ``$`` and must define a named group ``sid``.

    Parameters:
        workdir: directory in which ``wos_error.log`` is written on failure.

    Returns:
        The ``sid`` group captured from the authentication response.

    Raises:
        ValueError: the response did not match the expected pattern; the
            request and response are written to the error log first.
    """
    # parameters
    fn_error_log = workdir + "/" + "wos_error.log"
    with open("download_auth.xml", "r") as auth_file:
        auth_req = auth_file.read()
    with open("auth_check.xml", "r") as check_file:
        str_auth_check = check_file.read()
    # The template is stored pretty-printed; the response is matched as a
    # single line, so drop the line breaks and anchor the end.
    str_auth_check = str_auth_check.replace("\n", '').replace("\r", '')
    str_auth_check += "$"
    re_auth_check = re.compile(str_auth_check)
    wok_url = "http://search.isiknowledge.com/esti/wokmws/ws/"
    auth_url = wok_url + "WOKMWSAuthenticate"

    # query
    req = urllib2.Request(url=auth_url, data=auth_req)
    ff = urllib2.urlopen(req)
    try:
        resp = ff.read()
    finally:
        # Always release the connection, even if reading fails.
        ff.close()

    # check response
    m = re_auth_check.match(resp)
    if m:
        return m.group('sid')

    out_str = repr(datetime.datetime.now())
    out_str += ":: " + "Login failed"
    out_str += ":: " + auth_url
    out_str += ":: " + auth_req
    out_str += ":: " + resp
    with open(fn_error_log, 'w') as log_file:
        # One log entry per line: strip embedded line breaks.
        log_file.write(re.sub('[\r\n]', '', out_str) + '\n')
    raise ValueError("login failed: " + resp)


def _wos_read_text(path):
    """Return the whole content of *path*, closing the file afterwards."""
    with open(path, "r") as handle:
        return handle.read()


def _wos_write_text(path, text):
    """Write *text* to *path* (truncating it), closing the file afterwards."""
    with open(path, "w") as handle:
        handle.write(text)


def _wos_ensure_dir(path):
    """Create directory *path* (and parents) unless it already exists."""
    if os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # Another worker process may have created it in the meantime.
        if not os.path.isdir(path):
            raise


def wos_download(sid_file, searchparams, download_dir, q_g_counter):
    """Run one search request and store the response under *download_dir*.

    All files for a request share the base name ``sha224(repr(searchparams))``:

    * ``.ini``       request parameters, written before the request and
                     removed once the request is known to be done
    * ``.done``      raw response of a successful request
    * ``.log``       request parameters of a successful request
    * ``.wokxml_ok`` request XML of a successful request
    * ``.err``       parameters plus error details of a failed request
    * ``.wokxml``    request XML of a failed / empty / oversized request
    * ``.100``       marker for a query reporting more than 100000 records

    If a ``.done`` file already exists the stored result is reused and no
    request is sent.  Stored parameters that differ from the current ones
    are copied to ``err_log/<q_g_counter>/`` for inspection.

    Parameters:
        sid_file: path of the file holding the session id.
        searchparams: ``[task_url, edition, userquery, resp_count,
            firstrecord]``.
        download_dir: directory holding ``wok_search.xml`` (the request
            template) and receiving the result files.
        q_g_counter: running request number, used only to name the
            ``err_log`` sub-directory.

    Returns:
        ``(xml_fn, recordsfound, searchparams)``.  ``recordsfound`` is the
        matched digit string on success, ``'1'`` for an oversized result
        and the integer ``0`` when the request failed.
    """
    sid = _wos_read_text(sid_file)
    re_rf = re.compile("<recordsFound>(?P<rf>[0-9]+)</recordsFound>")
    task_url, edition, userquery, resp_count, firstrecord = searchparams
    wok_url = "http://search.isiknowledge.com/esti/wokmws/ws/"
    # hashlib wants bytes; repr() of these values is plain ASCII on
    # Python 2, so encoding it leaves the digest (and file names) unchanged.
    xml_fn = hashlib.sha224(repr(searchparams).encode("utf-8")).hexdigest()
    xml_fn_path = download_dir + "/" + xml_fn
    err_dir = "err_log" + "/" + str(q_g_counter)
    out_str = ""
    out_str += "task_url=" + str(task_url) + "\n"
    out_str += "edition=" + str(edition) + "\n"
    out_str += "userquery=" + str(userquery) + "\n"
    out_str += "resp_count=" + str(resp_count) + "\n"
    out_str += "firstrecord=" + str(firstrecord) + "\n"

    # --- bookkeeping: the .ini file marks a request as started ------------
    if os.path.exists(xml_fn_path + ".ini"):
        ini_fn_str = _wos_read_text(xml_fn_path + ".ini")
        if ini_fn_str != out_str:
            # Same hash but different parameters: keep both for inspection.
            _wos_ensure_dir(err_dir)
            _wos_write_text(err_dir + "/" + xml_fn + ".ini2", out_str)
            _wos_write_text(err_dir + "/" + xml_fn + ".ini", ini_fn_str)
    else:
        _wos_write_text(xml_fn_path + ".ini", out_str)

    # --- already downloaded: reuse the stored response --------------------
    if os.path.exists(xml_fn_path + ".done"):
        done_str = _wos_read_text(xml_fn_path + ".done")
        recordsfound = re_rf.search(done_str).group('rf')
        if os.path.exists(xml_fn_path + ".log"):
            log_fn_str = _wos_read_text(xml_fn_path + ".log")
            if log_fn_str == out_str:
                os.remove(xml_fn_path + ".ini")
            else:
                # Both copies are written only on a mismatch, after the
                # directory is known to exist.
                _wos_ensure_dir(err_dir)
                _wos_write_text(err_dir + "/" + xml_fn + ".log2", out_str)
                _wos_write_text(err_dir + "/" + xml_fn + ".log", log_fn_str)
        return xml_fn, recordsfound, searchparams

    # --- build the request from the template ------------------------------
    req_xml = _wos_read_text(download_dir + "/" + "wok_search.xml")
    req_xml = req_xml.replace("<edition>MYEDITION</edition>",
                              "<edition>" + edition + "</edition>")
    req_xml = req_xml.replace("<firstRecord>MYFIRSTRECORD</firstRecord>",
                              "<firstRecord>" + str(firstrecord) + "</firstRecord>")
    req_xml = req_xml.replace("<count>MYCOUNT</count>",
                              "<count>" + str(resp_count) + "</count>")
    req_xml = req_xml.replace('<userQuery>MYUSERQUERY</userQuery>',
                              '<userQuery>' + userquery + '</userQuery>')
    req = urllib2.Request(url=wok_url + task_url, data=req_xml)
    req.add_header('Encoding', 'UTF-8')
    req.add_header('content-type', 'text/xml')
    req.add_header('charset', 'UTF-8')
    req.add_header('domain', 'null')
    req.add_header('path', 'null')
    req.add_header('Cookie', 'SID="%s"' % sid)

    recordsfound = 0
    try:
        url_ff = urllib2.urlopen(req, timeout=300)
        try:
            resp = url_ff.read()
        finally:
            url_ff.close()
    except urllib2.URLError as err:
        err_out_str = out_str
        err_out_str += 'Errors:\n'
        err_out_str += str(err) + "\n"
        err_out_str += tracebackformat_exc() + "\n"
        # Only HTTPError carries a response body; a plain URLError
        # (DNS failure, refused connection, ...) has no read().
        if hasattr(err, "read"):
            err_out_str += err.read()
        _wos_write_text(xml_fn_path + ".err", err_out_str)
        _wos_write_text(xml_fn_path + ".wokxml", req_xml)
        return xml_fn, recordsfound, searchparams

    found = re_rf.search(resp)
    if found is None:
        # A reply without <recordsFound> is recorded like any other failed
        # request instead of crashing the worker.
        err_out_str = out_str
        err_out_str += 'Errors:\n'
        err_out_str += 'recordsFound missing in response\n'
        err_out_str += resp
        _wos_write_text(xml_fn_path + ".err", err_out_str)
        _wos_write_text(xml_fn_path + ".wokxml", req_xml)
        return xml_fn, recordsfound, searchparams

    recordsfound = found.group('rf')
    if int(recordsfound) > 100000:
        # Oversized result: flag it and report a single record so the
        # caller schedules no follow-up pages.
        _wos_write_text(xml_fn_path + ".wokxml", req_xml)
        _wos_write_text(xml_fn_path + ".err", out_str)
        _wos_write_text(xml_fn_path + ".100", out_str)
        recordsfound = '1'
    else:
        _wos_write_text(xml_fn_path + ".done", resp)
        _wos_write_text(xml_fn_path + ".log", out_str)
        _wos_write_text(xml_fn_path + ".wokxml_ok", req_xml)
        os.remove(xml_fn_path + ".ini")
    if int(recordsfound) < 1:
        _wos_write_text(xml_fn_path + ".wokxml", req_xml)
    return xml_fn, recordsfound, searchparams


def wos_decode(resp):
    return HTMLParser().unescape(resp)


def usage(argv):
    """Abort with the command-line usage message.

    Parameters:
        argv: the argument vector; ``argv[0]`` is shown as the script name.

    Raises:
        ValueError: always, carrying the usage text.
    """
    missing_args = "Usage: python " + argv[0] + " [--login|--download]"
    raise ValueError(missing_args)


def proc_args(argv):
    """Validate the command line and return the selected mode.

    The mode is ``argv[1]`` and must be ``--login`` or ``--download``;
    for anything else (or when it is missing) ``usage`` is called, which
    raises ``ValueError``.
    """
    mode_given = len(argv) >= 2
    if not mode_given or argv[1] not in ('--login', '--download'):
        usage(argv)  # raises ValueError with the usage text
    return argv[1]


def set_download_dir(workdir):
    """Create ``<workdir>/raw_xml`` and copy the request template into it.

    ``wok_search.xml`` is copied from the current working directory.

    Parameters:
        workdir: base working directory.

    Returns:
        The path of the download directory.

    Raises:
        OSError: the directory could not be created.
    """
    download_dir = workdir + "/raw_xml"
    # Create the directory without going through a shell, so paths with
    # spaces or shell metacharacters work and failures are not ignored.
    if not os.path.isdir(download_dir):
        try:
            os.makedirs(download_dir)
        except OSError:
            # Tolerate the directory appearing concurrently.
            if not os.path.isdir(download_dir):
                raise
    shutilcopyfile("wok_search.xml",
                   download_dir + "/wok_search.xml")
    return download_dir


def set_userquery():
    """Return the parameter list of a single hard-coded search.

    The list layout is ``[task_url, edition, userquery, resp_count,
    firstrecord]``.
    """
    return [
        "WokSearch",                     # task_url ("WokSearchLite" is the other option)
        "AHCI",                          # edition ("SCI" and "SSCI" are the other options)
        """ TS=(cadmium OR lead)  """,   # userquery
        100,                             # resp_count -- 0: summary, 100: max value
        1,                               # firstrecord -- 0 < f < recordsSearched
    ]


def set_userquery_list():
    """Build one search parameter list per (edition, journal) pair.

    Journal titles are read from ``source_list.txt`` in the current
    working directory, one per line; lines of length 0 or 1 are skipped.
    Each title is XML-escaped, non-ASCII characters become numeric
    character references, and the result is wrapped as ``SO="<title>"``.

    Returns:
        A list of ``[task_url, edition, userquery, resp_count,
        firstrecord]`` lists, ordered by edition and then by journal.
    """
    task_url = "WokSearch"  # "WokSearchLite" is the other option
    edition_list = ["SCI", "SSCI", "AHCI"]
    resp_count = 100  # 0: summary, 100: max value
    firstrecord = 1  # 0 < f < recordsSearched

    with open("source_list.txt", "rb") as source_file:
        raw = source_file.read()
    # Decode explicitly: calling .encode() on a Python 2 byte string first
    # decodes it as ASCII and fails on any non-ASCII title.
    # NOTE(review): assumes source_list.txt is UTF-8 (ASCII is a subset) -- confirm.
    text = raw.decode("utf-8").replace("\r\n", "\n")
    journal_list = [x for x in text.split('\n') if len(x) > 1]

    # The escaped title does not depend on the edition, so compute it once.
    escaped_names = []
    for jour_name in journal_list:
        jn = cgiescape(jour_name, quote=True)
        jn = jn.encode('ascii', 'xmlcharrefreplace')
        if not isinstance(jn, str):
            # Keep the query a native string on interpreters where
            # encode() returns a separate bytes type.
            jn = jn.decode('ascii')
        escaped_names.append(jn)

    searchparams_list = []
    for edition in edition_list:
        for jn in escaped_names:
            userquery = 'SO=' + '"' + jn + '"'
            searchparams_list.append([task_url,
                                      edition,
                                      userquery,
                                      resp_count,
                                      firstrecord])
    return searchparams_list

if __name__ == "__main__":
    # Driver: "--login" stores a fresh session id in the sid file;
    # "--download" queues one first-page request per search and then one
    # follow-up request per additional page of 100 records.
    workdir = 'download/'
    proc_num = 100          # number of worker processes in the pool
    wos_tr_time = 0.61      # pause in seconds after each submitted request
    feedback_time = 60      # minimum seconds between progress lines
    t0 = datetime.datetime.now()
    t_lastfb = t0
    print("Start: %s" % (t0,))
    os.system('mkdir -p ' + workdir)
    task_list = proc_args(sys.argv)
    sid_file = workdir + "wos_sid.txt"
    if "--login" in task_list:
        sid = wos_login(workdir)
        with open(sid_file, "w") as sid_out:
            sid_out.write(sid)
        print("sid written to %s" % sid_file)
        print("actual sid: %s" % sid)
    elif "--download" in task_list:
        # Reading the sid here fails early when no login has been done;
        # the workers re-read the file for every request.
        with open(sid_file, "r") as sid_in:
            sid = sid_in.read()
        download_dir = set_download_dir(workdir)
        searchparams_list = set_userquery_list()
        total = len(searchparams_list)
        #
        proc_pool = mp.Pool(processes=proc_num)
        par_pool = []
        print("Downloading")
        query_global_counter = 0    # requests since the last login
        q_g_counter = 0             # requests since program start
        for sp_idx, sp in enumerate(searchparams_list):
            par_pool.append(proc_pool.apply_async(
                wos_download, (sid_file, sp, download_dir, q_g_counter)))
            time.sleep(wos_tr_time)  # wos throttling
            query_global_counter += 1
            q_g_counter += 1
            if query_global_counter > 9000:
                # Refresh the session after 9000 requests.
                sid = wos_login(workdir)
                with open(sid_file, "w") as sid_out:
                    sid_out.write(sid)
                query_global_counter = 0
            t1 = datetime.datetime.now()
            if (t1 - t_lastfb).seconds > feedback_time:
                # Multiply before dividing so the estimate is not truncated
                # by integer division of the two counts.
                eta = t0 + (t1 - t0) * total // (sp_idx + 1)
                print("List: %s %s %s %s %s" % (t1 - t0, sp_idx, total, eta, t1))
                t_lastfb = t1
        rv_pool = []
        print("Queued jobs")
        while len(par_pool) > 0:
            # Decide "finished or not" exactly once per job: a job that
            # completes while follow-ups are being queued stays pending and
            # is processed on the next pass instead of being dropped.
            still_pending = []
            for rr in par_pool:
                if not rr.ready():
                    still_pending.append(rr)
                    continue
                xml_fn, recordsfound, searchparams = rr.get()
                task_url, edition, userquery, resp_count, firstrecord = searchparams
                # The first page is already fetched; request the rest.
                s_list = [[task_url, edition, userquery, resp_count, fr]
                          for fr in xrange(101, int(recordsfound), 100)]
                for sp in s_list:
                    rv_pool.append(proc_pool.apply_async(
                        wos_download, (sid_file, sp, download_dir, q_g_counter)))
                    time.sleep(wos_tr_time)  # wos throttling
                    query_global_counter += 1
                    q_g_counter += 1
                    if query_global_counter > 9000:
                        sid = wos_login(workdir)
                        with open(sid_file, "w") as sid_out:
                            sid_out.write(sid)
                        query_global_counter = 0
                    t1 = datetime.datetime.now()
                    if (t1 - t_lastfb).seconds > feedback_time:
                        print("Queue: %s %s rv: %s par: %s"
                              % (t1 - t0, t1, len(rv_pool), len(par_pool)))
                        t_lastfb = t1
            par_pool = still_pending
            rv_pool = [rr for rr in rv_pool if not rr.ready()]
            time.sleep(1)
            t1 = datetime.datetime.now()
            if (t1 - t_lastfb).seconds > feedback_time:
                print("Queue2: %s %s rv: %s par: %s"
                      % (t1 - t0, t1, len(rv_pool), len(par_pool)))
                t_lastfb = t1
        # Wait for the remaining follow-up requests.
        while len(rv_pool) > 0:
            rv_pool = [rr for rr in rv_pool if not rr.ready()]
            time.sleep(1)
            t1 = datetime.datetime.now()
            if (t1 - t_lastfb).seconds > feedback_time:
                print("Queue3: %s %s rv: %s par: %s"
                      % (t1 - t0, t1, len(rv_pool), len(par_pool)))
                t_lastfb = t1
        print("Finish")
        proc_pool.close()
        proc_pool.join()
        print("Number of download errors: %s" % len(glob.glob(download_dir + "/*.err")))
        print("Number of download done: %s" % len(glob.glob(download_dir + "/*.done")))
    print("END")
            
